require(ggplot2)
## Loading required package: ggplot2
data(diamonds)
data(mtcars)

Below, I append a logical column to the diamonds data frame. It will serve as test input for our explore function later on.

# Build a logical test column for diamonds by resampling mtcars$vs.
# mtcarsT is TRUE where mtcars$vs equals 1 and FALSE otherwise.
mtcarsT <- mtcars$vs == 1
# Draw one value per row of diamonds, with replacement. Using nrow(diamonds)
# instead of a hard-coded 53940 keeps the sample the same length as the data
# frame it is attached to. The share of TRUE values should be roughly the
# share of 1s in mtcars$vs.
newsample <- sample(mtcarsT, nrow(diamonds), replace = TRUE)
# Attach the sampled TRUE/FALSE vector to diamonds as the column logicalcol.
diamonds$logicalcol <- newsample
#' Explore a data frame: plots, frequency tables and pairwise correlations
#'
#' Produces, in this order:
#'   1. For every numeric column and every bin width: a count histogram and a
#'      density histogram, each with a red vertical line at the column mean.
#'   2. A bar plot for every factor column and for every logical column (or a
#'      printed notice when there are none).
#'   3. The squared Pearson correlation of every pair of numeric columns,
#'      printed as 1x1 matrices labelled with the two column names.
#'   4. Printed frequency tables (factor, then logical columns), a summary of
#'      every numeric column, a data frame of all pairs with their r-squared,
#'      and a data frame of the pairs whose absolute correlation is strictly
#'      greater than `corr_threshold`.
#'
#' @param df A data frame.
#' @param bin_sizes Vector of histogram bin widths; one pair of histograms is
#'   drawn per numeric column per width.
#' @param corr_threshold A single number. A pair is reported in the final
#'   table when abs(correlation) > corr_threshold.
#' @return Invisibly, the data frame with columns `pairs2` (names separated
#'   by a space) and `Pearson` (the correlation) for pairs above the
#'   threshold. It has zero rows when no pair qualifies.
explore <- function(df, bin_sizes, corr_threshold) {
  stopifnot(
    is.data.frame(df),
    is.numeric(corr_threshold),
    length(corr_threshold) == 1L
  )

  # vapply (unlike sapply) always yields a logical vector, even for a data
  # frame with zero columns, so the column subsetting below cannot fail.
  n <- df[vapply(df, is.numeric, logical(1))]  # numeric columns
  f <- df[vapply(df, is.factor, logical(1))]   # categorical columns
  b <- df[vapply(df, is.logical, logical(1))]  # logical (binary) columns
  n_names <- names(n)

  # PART I: histograms ------------------------------------------------------
  # seq_along() makes the loop a no-op when there are no numeric columns.
  for (i in seq_along(n)) {
    # A local one-column data frame replaces the former global `variab`
    # (assigned with `<<-`), so nothing leaks into the caller's workspace.
    plot_data <- data.frame(value = n[[i]])
    centre <- mean(plot_data$value, na.rm = TRUE)
    for (width in bin_sizes) {
      # Count histogram. `colour = "red"` is set outside aes() so the line is
      # drawn in red rather than mapped to a legend entry labelled "red".
      print(
        ggplot(plot_data, aes(x = value)) +
          geom_histogram(binwidth = width, fill = "steelblue") +
          geom_vline(xintercept = centre, colour = "red") +
          labs(x = n_names[i])
      )
      # Density histogram: same plot with y mapped to the computed density.
      # NOTE(review): newer ggplot2 releases prefer after_stat(density).
      print(
        ggplot(plot_data, aes(x = value)) +
          geom_histogram(
            aes(y = ..density..),
            binwidth = width,
            fill = "steelblue"
          ) +
          geom_vline(xintercept = centre, colour = "red") +
          labs(x = n_names[i])
      )
    }
  }

  # PART II: bar plots ------------------------------------------------------
  if (length(f) == 0) {
    print("no categorical variables")
  } else {
    for (i in seq_along(f)) {
      barplot(table(f[[i]]))
    }
  }
  if (length(b) == 0) {
    print("no binary variables")
  } else {
    for (i in seq_along(b)) {
      barplot(table(b[[i]]))
    }
  }

  # Pairwise correlations, computed once and reused by PART III and PART IV.
  # combn() enumerates the pairs (1,2), (1,3), ..., (k-1,k), the same order
  # the nested index loops used, and is only called when a pair exists.
  k <- length(n)
  if (k >= 2) {
    r <- cor(n)
    idx <- utils::combn(k, 2)
    first <- idx[1, ]
    second <- idx[2, ]
    r_values <- r[cbind(first, second)]
  } else {
    r <- NULL
    first <- integer(0)
    second <- integer(0)
    r_values <- numeric(0)
  }

  # PART III: squared correlation of every pair -----------------------------
  # drop = FALSE keeps the 1x1 matrix with its row/column names for printing.
  for (p in seq_along(first)) {
    print(r[first[p], second[p], drop = FALSE]^2)
  }

  # PART IV -----------------------------------------------------------------
  # Frequency tables: the counts of all factor columns concatenated into one
  # named vector (NULL when there are none), then the same for logicals.
  factor_table <- do.call(c, unname(lapply(f, table)))
  print(factor_table)

  binom_table <- do.call(c, unname(lapply(b, table)))
  print(binom_table)

  # Summary statistics for every numeric column.
  for (column in n) {
    print(summary(column))
  }

  # Every pair (names joined by "-") with its r-squared.
  names_rsquared <- data.frame(
    pairs = paste(n_names[first], n_names[second], sep = "-"),
    r_squared = r_values^2
  )
  print(names_rsquared)

  # Pairs whose absolute correlation is strictly greater than the threshold.
  # NA correlations are skipped instead of aborting the comparison.
  keep <- !is.na(r_values) & abs(r_values) > corr_threshold
  correlationframe <- data.frame(
    pairs2 = paste(n_names[first], n_names[second])[keep],
    Pearson = r_values[keep]
  )
  print(correlationframe)
  invisible(correlationframe)
}


# Run the exploration on diamonds with bin widths 5, 20 and 50, reporting
# pairs whose absolute correlation exceeds 0.25.
explore(df = diamonds, bin_sizes = c(5, 20, 50), corr_threshold = 0.25)

##              depth
## carat 0.0007966119
##            table
## carat 0.03298493
##           price
## carat 0.8493305
##               x
## carat 0.9508088
##               y
## carat 0.9057751
##               z
## carat 0.9089475
##            table
## depth 0.08748493
##              price
## depth 0.0001133672
##                 x
## depth 0.000639546
##                 y
## depth 0.000860875
##                 z
## depth 0.009010543
##            price
## table 0.01616303
##                x
## table 0.03815939
##                y
## table 0.03376779
##                z
## table 0.02277947
##               x
## price 0.7822256
##               y
## price 0.7489533
##               z
## price 0.7417506
##          y
## x 0.950043
##           z
## x 0.9423979
##           z
## y 0.9063149
##      Fair      Good Very Good   Premium     Ideal         D         E 
##      1610      4906     12082     13791     21551      6775      9797 
##         F         G         H         I         J        I1       SI2 
##      9542     11292      8304      5422      2808       741      9194 
##       SI1       VS2       VS1      VVS2      VVS1        IF 
##     13065     12258      8171      5066      3655      1790 
## FALSE  TRUE 
## 30348 23592 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2000  0.4000  0.7000  0.7979  1.0400  5.0100 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   43.00   61.00   61.80   61.75   62.50   79.00 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   43.00   56.00   57.00   57.46   59.00   95.00 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     950    2401    3933    5324   18820 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   4.710   5.700   5.731   6.540  10.740 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   4.720   5.710   5.735   6.540  58.900 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.910   3.530   3.539   4.040  31.800 
##          pairs    r_squared
## 1  carat-depth 0.0007966119
## 2  carat-table 0.0329849332
## 3  carat-price 0.8493305264
## 4      carat-x 0.9508087510
## 5      carat-y 0.9057751441
## 6      carat-z 0.9089474974
## 7  depth-table 0.0874849338
## 8  depth-price 0.0001133672
## 9      depth-x 0.0006395460
## 10     depth-y 0.0008608750
## 11     depth-z 0.0090105434
## 12 table-price 0.0161630291
## 13     table-x 0.0381593881
## 14     table-y 0.0337677917
## 15     table-z 0.0227794699
## 16     price-x 0.7822255540
## 17     price-y 0.7489533305
## 18     price-z 0.7417506045
## 19         x-y 0.9500429745
## 20         x-z 0.9423978849
## 21         y-z 0.9063148836
##         pairs2    Pearson
## 1  carat price  0.9215913
## 2      carat x  0.9750942
## 3      carat y  0.9517222
## 4      carat z  0.9533874
## 5  depth table -0.2957785
## 6      price x  0.8844352
## 7      price y  0.8654209
## 8      price z  0.8612494
## 9          x y  0.9747015
## 10         x z  0.9707718
## 11         y z  0.9520057
# Run the same exploration on mtcars (bin widths 5, 20 and 50; correlation
# threshold 0.25).
explore(df = mtcars, bin_sizes = c(5, 20, 50), corr_threshold = 0.25)

## [1] "no categorical variables"
## [1] "no binary variables"
##         cyl
## mpg 0.72618
##          disp
## mpg 0.7183433
##            hp
## mpg 0.6024373
##          drat
## mpg 0.4639952
##            wt
## mpg 0.7528328
##          qsec
## mpg 0.1752963
##            vs
## mpg 0.4409477
##            am
## mpg 0.3597989
##          gear
## mpg 0.2306734
##          carb
## mpg 0.3035184
##          disp
## cyl 0.8136633
##            hp
## cyl 0.6929688
##          drat
## cyl 0.4899134
##            wt
## cyl 0.6122997
##          qsec
## cyl 0.3495672
##            vs
## cyl 0.6574158
##            am
## cyl 0.2731181
##          gear
## cyl 0.2427401
##          carb
## cyl 0.2777167
##             hp
## disp 0.6255997
##           drat
## disp 0.5044038
##             wt
## disp 0.7885083
##           qsec
## disp 0.1880939
##             vs
## disp 0.5046907
##             am
## disp 0.3495494
##           gear
## disp 0.3086571
##           carb
## disp 0.1560067
##         drat
## hp 0.2013847
##           wt
## hp 0.4339488
##         qsec
## hp 0.5015804
##           vs
## hp 0.5228689
##            am
## hp 0.05914831
##          gear
## hp 0.01580156
##         carb
## hp 0.5622187
##             wt
## drat 0.5075717
##             qsec
## drat 0.008318308
##             vs
## drat 0.1938451
##             am
## drat 0.5079572
##           gear
## drat 0.4894543
##             carb
## drat 0.008242788
##          qsec
## wt 0.03052564
##           vs
## wt 0.3079314
##           am
## wt 0.4795497
##         gear
## wt 0.3402237
##         carb
## wt 0.1828468
##            vs
## qsec 0.554333
##              am
## qsec 0.05283602
##            gear
## qsec 0.04523373
##          carb
## qsec 0.430663
##            am
## vs 0.02834008
##          gear
## vs 0.04244562
##         carb
## vs 0.3244523
##         gear
## am 0.6305293
##           carb
## am 0.003310202
##            carb
## gear 0.07511592
## NULL
## NULL
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.40   15.42   19.20   20.09   22.80   33.90 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.000   4.000   6.000   6.188   8.000   8.000 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    71.1   120.8   196.3   230.7   326.0   472.0 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    52.0    96.5   123.0   146.7   180.0   335.0 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.760   3.080   3.695   3.597   3.920   4.930 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.513   2.581   3.325   3.217   3.610   5.424 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   14.50   16.89   17.71   17.85   18.90   22.90 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.4375  1.0000  1.0000 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.4062  1.0000  1.0000 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   3.000   4.000   3.688   4.000   5.000 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   2.000   2.812   4.000   8.000 
##        pairs   r_squared
## 1    mpg-cyl 0.726180005
## 2   mpg-disp 0.718343340
## 3     mpg-hp 0.602437341
## 4   mpg-drat 0.463995168
## 5     mpg-wt 0.752832794
## 6   mpg-qsec 0.175296320
## 7     mpg-vs 0.440947686
## 8     mpg-am 0.359798943
## 9   mpg-gear 0.230673448
## 10  mpg-carb 0.303518437
## 11  cyl-disp 0.813663302
## 12    cyl-hp 0.692968762
## 13  cyl-drat 0.489913363
## 14    cyl-wt 0.612299668
## 15  cyl-qsec 0.349567190
## 16    cyl-vs 0.657415769
## 17    cyl-am 0.273118125
## 18  cyl-gear 0.242740085
## 19  cyl-carb 0.277716662
## 20   disp-hp 0.625599666
## 21 disp-drat 0.504403822
## 22   disp-wt 0.788508342
## 23 disp-qsec 0.188093852
## 24   disp-vs 0.504690738
## 25   disp-am 0.349549413
## 26 disp-gear 0.308657134
## 27 disp-carb 0.156006724
## 28   hp-drat 0.201384745
## 29     hp-wt 0.433948779
## 30   hp-qsec 0.501580369
## 31     hp-vs 0.522868892
## 32     hp-am 0.059148311
## 33   hp-gear 0.015801561
## 34   hp-carb 0.562218742
## 35   drat-wt 0.507571675
## 36 drat-qsec 0.008318308
## 37   drat-vs 0.193845127
## 38   drat-am 0.507957151
## 39 drat-gear 0.489454337
## 40 drat-carb 0.008242788
## 41   wt-qsec 0.030525638
## 42     wt-vs 0.307931409
## 43     wt-am 0.479549684
## 44   wt-gear 0.340223720
## 45   wt-carb 0.182846838
## 46   qsec-vs 0.554333027
## 47   qsec-am 0.052836016
## 48 qsec-gear 0.045233731
## 49 qsec-carb 0.430663050
## 50     vs-am 0.028340081
## 51   vs-gear 0.042445620
## 52   vs-carb 0.324452295
## 53   am-gear 0.630529315
## 54   am-carb 0.003310202
## 55 gear-carb 0.075115920
##       pairs2    Pearson
## 1    mpg cyl -0.8521620
## 2   mpg disp -0.8475514
## 3     mpg hp -0.7761684
## 4   mpg drat  0.6811719
## 5     mpg wt -0.8676594
## 6   mpg qsec  0.4186840
## 7     mpg vs  0.6640389
## 8     mpg am  0.5998324
## 9   mpg gear  0.4802848
## 10  mpg carb -0.5509251
## 11  cyl disp  0.9020329
## 12    cyl hp  0.8324475
## 13  cyl drat -0.6999381
## 14    cyl wt  0.7824958
## 15  cyl qsec -0.5912421
## 16    cyl vs -0.8108118
## 17    cyl am -0.5226070
## 18  cyl gear -0.4926866
## 19  cyl carb  0.5269883
## 20   disp hp  0.7909486
## 21 disp drat -0.7102139
## 22   disp wt  0.8879799
## 23 disp qsec -0.4336979
## 24   disp vs -0.7104159
## 25   disp am -0.5912270
## 26 disp gear -0.5555692
## 27 disp carb  0.3949769
## 28   hp drat -0.4487591
## 29     hp wt  0.6587479
## 30   hp qsec -0.7082234
## 31     hp vs -0.7230967
## 32   hp carb  0.7498125
## 33   drat wt -0.7124406
## 34   drat vs  0.4402785
## 35   drat am  0.7127111
## 36 drat gear  0.6996101
## 37     wt vs -0.5549157
## 38     wt am -0.6924953
## 39   wt gear -0.5832870
## 40   wt carb  0.4276059
## 41   qsec vs  0.7445354
## 42 qsec carb -0.6562492
## 43   vs carb -0.5696071
## 44   am gear  0.7940588
## 45 gear carb  0.2740728